import os, itertools, functools
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import scanpy as sc
configs = {}
configs['this'] = {}
configs['this']['id'] = '1_stats_qc_filtering'
configs['this']['project_dir'] = '/hpc/scratch/hdd2/mp674001/projects/2024_01_parse_TAP-seq_human_pilot'
configs['this']['analysis_dir'] = f"{configs['this']['project_dir']}/analysis"
configs['this']['configs_file'] = f"{configs['this']['analysis_dir']}/configs/1_stats_qc_filtering_config.csv"
configs['this']['primers_file'] = f"{configs['this']['analysis_dir']}/configs/primers_sequences.csv"
configs['this']['irms_subm_metadata'] = {'CR138AE': 'TAP-seq', 'CR5AE': 'Gex'}
configs['this']['file_name_pattern'] = '_config.csv'
configs['this']['results_dir'] = f"{configs['this']['analysis_dir']}/results"
configs['this']['python_binary_objects_dir'] = f"{configs['this']['analysis_dir']}/python_binary_objects"
configs['this']['plots_dir'] = f"{configs['this']['analysis_dir']}/plots/{configs['this']['id']}"
def import_config(file_name: str) -> dict:
configs = pd.read_table(file_name, sep=',', header=None, names=['param', 'value'])
configs = configs.set_index('param').loc[:, 'value'].to_dict()
return configs
def extract_config_name(file: str, file_name_pattern: str) -> str:
config_name: str = os.path.basename(file).replace(file_name_pattern, '')
return config_name
def import_configs(root_dir, file_name_pattern) -> None:
import glob
glob_str: str = f'{root_dir}/**/*{file_name_pattern}'
file_names = glob.glob(glob_str, recursive=True)
config_names = map(
extract_config_name, file_names,
itertools.repeat(file_name_pattern)
)
configs = map(import_config, file_names)
configs = dict(zip(config_names, configs))
return configs
configs['imported'] = import_configs(
configs['this']['project_dir'],
configs['this']['file_name_pattern']
)
def create_unexisting_dirs(configs) -> None:
import re
configs_flat = pd.json_normalize(configs, sep='|')
configs_flat = configs_flat.to_dict(orient='records')[0]
for key, value in configs_flat.items():
if not re.search(r'.*_dir$', key): continue
if os.path.exists(value): continue
os.makedirs(value)
print('CREATED:', key, value)
create_unexisting_dirs(configs)
def print_notebook_setup(configs) -> None:
import json
for name in globals():
obj = eval(name)
if not hasattr(obj, '__version__'): continue
print(f'{obj.__name__}: {obj.__version__}')
environ = os.environ
environ = dict(zip(environ.keys(), environ.values()))
print(
f'\nCONFIGS\n{json.dumps(configs, sort_keys=True, indent=6)}\n'
f'\nENVIRONMENT\n{json.dumps(environ, sort_keys=True, indent=6)}'
)
print_notebook_setup(configs)
class Utils:
@staticmethod
def category_to_str(x: pd.DataFrame) -> pd.DataFrame:
non_cat = x.select_dtypes(exclude = 'category')
cat = x.select_dtypes('category').astype(np.str_)
x = pd.concat([cat, non_cat], axis = 1)
return x
@staticmethod
def str_to_category(x: pd.DataFrame) -> pd.DataFrame:
non_str = x.select_dtypes(exclude='object')
str_ = x.select_dtypes('object').astype('category')
x = pd.concat([str_, non_str], axis=1)
return x
@staticmethod
def remove_unused_categories(x: pd.DataFrame) -> pd.DataFrame:
non_cat = x.select_dtypes(exclude = 'category')
cat = (
x.select_dtypes('category')
.apply(lambda x: x.cat.remove_unused_categories())
)
x = pd.concat([cat, non_cat], axis = 1)
return x
sc.settings.verbosity = 3
sc.settings.set_figure_params(dpi=80, facecolor='white')
plt.rcParams['font.size'] = 15
plt.rcParams['text.color'] = 'grey'
plt.rcParams['xtick.color'] = 'grey'
plt.rcParams['ytick.color'] = 'grey'
plt.rcParams['xtick.major.size'] = 10
plt.rcParams['ytick.major.size'] = 10
plt.rcParams['legend.frameon'] = False
plt.rcParams['legend.loc'] = 'upper left'
plt.rcParams['axes.spines.right'] = False
plt.rcParams['axes.spines.top'] = False
plt.rcParams['axes.grid']=False
plt.rcParams['axes.labelcolor'] = 'grey'
plt.rcParams['axes.edgecolor'] = 'grey'
plt.rcParams['figure.constrained_layout.h_pad'] = 0.12
plt.rcParams['figure.constrained_layout.w_pad'] = 0.12
plt.rcParams['savefig.dpi'] = 400
plt.rcParams['savefig.bbox'] = 'tight'
plt.rcParams['figure.facecolor']='white'
plt.rcParams['axes.facecolor'] = 'white'
pandas: 1.4.1
numpy: 1.22.2
matplotlib: 3.5.0
scanpy: 1.8.2
CONFIGS
{
"imported": {
"1_stats_qc_filtering": {},
"processing": {
"anndata_objects_dir": "/hpc/scratch/hdd2/mp674001/projects/2024_01_parse_TAP-seq_human_pilot/processing/workflow_outputs/anndata_objects",
"barcode_suffix_separator": "__",
"configs_dir": "/hpc/scratch/hdd2/mp674001/projects/2024_01_parse_TAP-seq_human_pilot/processing/configs",
"count_bam_files_lines_cores": "8",
"count_bam_files_lines_mem_per_cpu": "1G",
"count_bam_files_lines_outputs_dir": "/hpc/scratch/hdd2/mp674001/projects/2024_01_parse_TAP-seq_human_pilot/processing/workflow_outputs/input_files_line_count/bam",
"count_bam_files_lines_time": "1-00:00:00",
"count_fastq_files_lines_mem_per_cpu": "1G",
"count_fastq_files_lines_outputs_dir": "/hpc/scratch/hdd2/mp674001/projects/2024_01_parse_TAP-seq_human_pilot/processing/workflow_outputs/input_files_line_count/fastq",
"count_fastq_files_lines_time": "1-00:00:00",
"data_dir": "/hpc/scratch/hdd2/mp674001/projects/2024_01_parse_TAP-seq_human_pilot/processing/data",
"experimental_metadata_file": "/hpc/scratch/hdd2/mp674001/projects/2024_01_parse_TAP-seq_human_pilot/processing/configs/experimental_metadata.tsv",
"fastq_dir": "/hpc/scratch/hdd2/mp674001/projects/2024_01_parse_TAP-seq_human_pilot/processing/data/fastq",
"fastq_files_error_file": "/hpc/scratch/hdd2/mp674001/projects/2024_01_parse_TAP-seq_human_pilot/processing/workflow_files/tmp/fastq_files_error.txt",
"fastq_files_regex_pattern": "([A-Z]{2}[0-9]+[A-Z]{2}[0-9]+P[0-9]+)(_.*)?_(S[0-9]+)_?(L[0-9]+)?_(R[0-9]+)_([0-9]+)(.*)",
"fastq_files_regex_pattern_library_id_pos": "1",
"fastq_files_regex_pattern_read_direction_pos": "5",
"fastq_files_table_file": "/hpc/scratch/hdd2/mp674001/projects/2024_01_parse_TAP-seq_human_pilot/processing/workflow_files/tmp/fastq_files_table.tsv",
"fastq_urls_file": "/hpc/scratch/hdd2/mp674001/projects/2024_01_parse_TAP-seq_human_pilot/processing/workflow_files/tmp/fastq_urls.txt",
"generate_anndata_environment": "/hpc/scratch/hdd1/mp674001/software/conda_envs/python-3.9_r-4.1",
"generate_anndata_mem_per_cpu": "24G",
"generate_anndata_script": "/hpc/scratch/hdd2/mp674001/projects/2024_01_parse_TAP-seq_human_pilot/processing/scripts/generate_anndata_object.py",
"generate_anndata_time": "2:00:00",
"job_scripts_dir": "/hpc/scratch/hdd2/mp674001/projects/2024_01_parse_TAP-seq_human_pilot/processing/workflow_files/job_scripts",
"job_standard_errors_dir": "/hpc/scratch/hdd2/mp674001/projects/2024_01_parse_TAP-seq_human_pilot/processing/workflow_files/job_standard_errors",
"job_standard_outputs_dir": "/hpc/scratch/hdd2/mp674001/projects/2024_01_parse_TAP-seq_human_pilot/processing/workflow_files/job_standard_outputs",
"jobs_stats_outputs_dir": "/hpc/scratch/hdd2/mp674001/projects/2024_01_parse_TAP-seq_human_pilot/processing/workflow_outputs/job_stats",
"jobs_stats_script": "/hpc/scratch/hdd2/mp674001/projects/2024_01_parse_TAP-seq_human_pilot/processing/scripts/write_job_stats_tables.awk",
"logs_dir": "/hpc/scratch/hdd2/mp674001/projects/2024_01_parse_TAP-seq_human_pilot/processing/workflow_files/logs",
"min_read_count": "0",
"processing_dir": "/hpc/scratch/hdd2/mp674001/projects/2024_01_parse_TAP-seq_human_pilot/processing",
"scripts_dir": "/hpc/scratch/hdd2/mp674001/projects/2024_01_parse_TAP-seq_human_pilot/processing/scripts",
"split_pipe_bam_file_relative_path": "process/barcode_headAligned_anno.bam",
"split_pipe_cell_metadata_relative_path": "all-sample/DGE_unfiltered/cell_metadata.csv",
"split_pipe_chemistry": "v1",
"split_pipe_cores": "10",
"split_pipe_counts_relative_path": "all-sample/DGE_unfiltered/count_matrix.mtx",
"split_pipe_dir": "/hpc/scratch/hdd2/mp674001/projects/2024_01_parse_TAP-seq_human_pilot/processing/data/split_pipe",
"split_pipe_environment": "/hpc/scratch/hdd1/mp674001/software/conda_envs/split_pipe_1.1",
"split_pipe_genes_metadata_relative_path": "all-sample/DGE_unfiltered/all_genes.csv",
"split_pipe_genome_dir": "/hpc/scratch/hdd1/mp674001/genomes/Homo_sapiens/NCBI/GRCh38/refdata_parse_biosciences/parse_biosciences_v2_1.1_index",
"split_pipe_html_basename": "all-sample_analysis_summary.html",
"split_pipe_kit": "WT_mini",
"split_pipe_mem_per_cpu": "50G",
"split_pipe_sample_list_file": "/hpc/scratch/hdd2/mp674001/projects/2024_01_parse_TAP-seq_human_pilot/processing/configs/experimental_metadata.tsv",
"split_pipe_time": "4-00:00:00",
"split_pipe_web_summaries_dir": "/hpc/scratch/hdd2/mp674001/projects/2024_01_parse_TAP-seq_human_pilot/processing/workflow_outputs/split_pipe_web_summaries",
"tmp_dir": "/hpc/scratch/hdd2/mp674001/projects/2024_01_parse_TAP-seq_human_pilot/processing/workflow_files/tmp",
"workflow_configs_file": "/hpc/scratch/hdd2/mp674001/projects/2024_01_parse_TAP-seq_human_pilot/processing/configs/processing_config.csv",
"workflow_files_dir": "/hpc/scratch/hdd2/mp674001/projects/2024_01_parse_TAP-seq_human_pilot/processing/workflow_files",
"workflow_outputs_dir": "/hpc/scratch/hdd2/mp674001/projects/2024_01_parse_TAP-seq_human_pilot/processing/workflow_outputs",
"write_fastq_files_table_script": "/hpc/scratch/hdd2/mp674001/projects/2024_01_parse_TAP-seq_human_pilot/processing/scripts/write_fastq_files_table.awk"
}
},
"this": {
"analysis_dir": "/hpc/scratch/hdd2/mp674001/projects/2024_01_parse_TAP-seq_human_pilot/analysis",
"configs_file": "/hpc/scratch/hdd2/mp674001/projects/2024_01_parse_TAP-seq_human_pilot/analysis/configs/1_stats_qc_filtering_config.csv",
"file_name_pattern": "_config.csv",
"id": "1_stats_qc_filtering",
"irms_subm_metadata": {
"CR138AE": "TAP-seq",
"CR5AE": "Gex"
},
"plots_dir": "/hpc/scratch/hdd2/mp674001/projects/2024_01_parse_TAP-seq_human_pilot/analysis/plots/1_stats_qc_filtering",
"primers_file": "/hpc/scratch/hdd2/mp674001/projects/2024_01_parse_TAP-seq_human_pilot/analysis/configs/primers_sequences.csv",
"project_dir": "/hpc/scratch/hdd2/mp674001/projects/2024_01_parse_TAP-seq_human_pilot",
"python_binary_objects_dir": "/hpc/scratch/hdd2/mp674001/projects/2024_01_parse_TAP-seq_human_pilot/analysis/python_binary_objects",
"results_dir": "/hpc/scratch/hdd2/mp674001/projects/2024_01_parse_TAP-seq_human_pilot/analysis/results"
}
}
ENVIRONMENT
{
"ADDR2LINE": "/hpc/scratch/hdd1/mp674001/software/conda_envs/python-3.9_r-4.1/bin/x86_64-conda-linux-gnu-addr2line",
"ANACONDA3_ROOT": "/hpc/apps/current/anaconda3/v5.0.1b.app",
"AR": "/hpc/scratch/hdd1/mp674001/software/conda_envs/python-3.9_r-4.1/bin/x86_64-conda-linux-gnu-ar",
"AS": "/hpc/scratch/hdd1/mp674001/software/conda_envs/python-3.9_r-4.1/bin/x86_64-conda-linux-gnu-as",
"BASH_FUNC_create_passwd()": "() { tr -cd 'a-zA-Z0-9' < /dev/urandom 2> /dev/null | head -c${1:-8}\n}",
"BASH_FUNC_module()": "() { eval `/usr/bin/modulecmd bash $*`\n}",
"BASH_FUNC_random_number()": "() { shuf -i ${1}-${2} -n 1\n}",
"BUILD": "x86_64-conda-linux-gnu",
"CC": "/hpc/scratch/hdd1/mp674001/software/conda_envs/python-3.9_r-4.1/bin/x86_64-conda-linux-gnu-cc",
"CC_FOR_BUILD": "/hpc/scratch/hdd1/mp674001/software/conda_envs/python-3.9_r-4.1/bin/x86_64-conda-linux-gnu-cc",
"CFLAGS": "-march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /hpc/scratch/hdd1/mp674001/software/conda_envs/python-3.9_r-4.1/include",
"CLICOLOR": "1",
"CLUSTER_LOCAL": "ushpc",
"CMAKE_ARGS": "-DCMAKE_AR=/hpc/scratch/hdd1/mp674001/software/conda_envs/python-3.9_r-4.1/bin/x86_64-conda-linux-gnu-ar -DCMAKE_CXX_COMPILER_AR=/hpc/scratch/hdd1/mp674001/software/conda_envs/python-3.9_r-4.1/bin/x86_64-conda-linux-gnu-gcc-ar -DCMAKE_C_COMPILER_AR=/hpc/scratch/hdd1/mp674001/software/conda_envs/python-3.9_r-4.1/bin/x86_64-conda-linux-gnu-gcc-ar -DCMAKE_RANLIB=/hpc/scratch/hdd1/mp674001/software/conda_envs/python-3.9_r-4.1/bin/x86_64-conda-linux-gnu-ranlib -DCMAKE_CXX_COMPILER_RANLIB=/hpc/scratch/hdd1/mp674001/software/conda_envs/python-3.9_r-4.1/bin/x86_64-conda-linux-gnu-gcc-ranlib -DCMAKE_C_COMPILER_RANLIB=/hpc/scratch/hdd1/mp674001/software/conda_envs/python-3.9_r-4.1/bin/x86_64-conda-linux-gnu-gcc-ranlib -DCMAKE_LINKER=/hpc/scratch/hdd1/mp674001/software/conda_envs/python-3.9_r-4.1/bin/x86_64-conda-linux-gnu-ld -DCMAKE_STRIP=/hpc/scratch/hdd1/mp674001/software/conda_envs/python-3.9_r-4.1/bin/x86_64-conda-linux-gnu-strip",
"CMAKE_PREFIX_PATH": "/hpc/scratch/hdd1/mp674001/software/conda_envs/python-3.9_r-4.1:/hpc/scratch/hdd1/mp674001/software/conda_envs/python-3.9_r-4.1/x86_64-conda-linux-gnu/sysroot/usr",
"CONDA_BUILD_SYSROOT": "/hpc/scratch/hdd1/mp674001/software/conda_envs/python-3.9_r-4.1/x86_64-conda-linux-gnu/sysroot",
"CONDA_DEFAULT_ENV": "/hpc/scratch/hdd1/mp674001/software/conda_envs/python-3.9_r-4.1",
"CONDA_ENVS_PATH": "/hpc/scratch/hdd1/mp674001/software/conda_envs:/hpc/projects/upt/hgcb/shared/iTREGscreen/software/conda_environments:/hpc/scratch/hdd2/mp674001/software/conda_envs",
"CONDA_EXE": "/hpc/apps/2018/anaconda3/v5.0.1b.app/bin/conda",
"CONDA_PREFIX": "/hpc/scratch/hdd1/mp674001/software/conda_envs/python-3.9_r-4.1",
"CONDA_PROMPT_MODIFIER": "(/hpc/scratch/hdd1/mp674001/software/conda_envs/python-3.9_r-4.1) ",
"CONDA_PYTHON_EXE": "/hpc/apps/2018/anaconda3/v5.0.1b.app/bin/python",
"CONDA_ROOT": "/hpc/apps/2018/anaconda3/v5.0.1b.app",
"CONDA_SHLVL": "1",
"CONDA_TOOLCHAIN_BUILD": "x86_64-conda-linux-gnu",
"CONDA_TOOLCHAIN_HOST": "x86_64-conda-linux-gnu",
"CONFIG_FILE": "/home/mp674001/ondemand/data/sys/dashboard/batch_connect/sys/jupyter-lab/output/8932db07-3953-4232-bf13-f768b195c70b/config.py",
"CPATH": "/cm/shared/slurm/current/include:/cm/shared/slurm/current/include",
"CPP": "/hpc/scratch/hdd1/mp674001/software/conda_envs/python-3.9_r-4.1/bin/x86_64-conda-linux-gnu-cpp",
"CPPFLAGS": "-DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /hpc/scratch/hdd1/mp674001/software/conda_envs/python-3.9_r-4.1/include",
"CVS_RSH": "ssh",
"CXX": "/hpc/scratch/hdd1/mp674001/software/conda_envs/python-3.9_r-4.1/bin/x86_64-conda-linux-gnu-c++",
"CXXFILT": "/hpc/scratch/hdd1/mp674001/software/conda_envs/python-3.9_r-4.1/bin/x86_64-conda-linux-gnu-c++filt",
"CXXFLAGS": "-fvisibility-inlines-hidden -std=c++17 -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /hpc/scratch/hdd1/mp674001/software/conda_envs/python-3.9_r-4.1/include",
"CXX_FOR_BUILD": "/hpc/scratch/hdd1/mp674001/software/conda_envs/python-3.9_r-4.1/bin/x86_64-conda-linux-gnu-c++",
"DEBUG_CFLAGS": "-march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fvar-tracking-assignments -ffunction-sections -pipe -isystem /hpc/scratch/hdd1/mp674001/software/conda_envs/python-3.9_r-4.1/include",
"DEBUG_CPPFLAGS": "-D_DEBUG -D_FORTIFY_SOURCE=2 -Og -isystem /hpc/scratch/hdd1/mp674001/software/conda_envs/python-3.9_r-4.1/include",
"DEBUG_CXXFLAGS": "-fvisibility-inlines-hidden -std=c++17 -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fvar-tracking-assignments -ffunction-sections -pipe -isystem /hpc/scratch/hdd1/mp674001/software/conda_envs/python-3.9_r-4.1/include",
"DEBUG_FFLAGS": "-fopenmp -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /hpc/scratch/hdd1/mp674001/software/conda_envs/python-3.9_r-4.1/include -fopenmp -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fcheck=all -fbacktrace -fimplicit-none -fvar-tracking-assignments -ffunction-sections -pipe",
"DEBUG_FORTRANFLAGS": "-fopenmp -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /hpc/scratch/hdd1/mp674001/software/conda_envs/python-3.9_r-4.1/include -fopenmp -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fcheck=all -fbacktrace -fimplicit-none -fvar-tracking-assignments -ffunction-sections -pipe",
"ELFEDIT": "/hpc/scratch/hdd1/mp674001/software/conda_envs/python-3.9_r-4.1/bin/x86_64-conda-linux-gnu-elfedit",
"ENVIRONMENT": "BATCH",
"F77": "/hpc/scratch/hdd1/mp674001/software/conda_envs/python-3.9_r-4.1/bin/x86_64-conda-linux-gnu-gfortran",
"F90": "/hpc/scratch/hdd1/mp674001/software/conda_envs/python-3.9_r-4.1/bin/x86_64-conda-linux-gnu-gfortran",
"F95": "/hpc/scratch/hdd1/mp674001/software/conda_envs/python-3.9_r-4.1/bin/x86_64-conda-linux-gnu-f95",
"FC": "/hpc/scratch/hdd1/mp674001/software/conda_envs/python-3.9_r-4.1/bin/x86_64-conda-linux-gnu-gfortran",
"FFLAGS": "-fopenmp -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /hpc/scratch/hdd1/mp674001/software/conda_envs/python-3.9_r-4.1/include",
"FORTRANFLAGS": "-fopenmp -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /hpc/scratch/hdd1/mp674001/software/conda_envs/python-3.9_r-4.1/include",
"GCC": "/hpc/scratch/hdd1/mp674001/software/conda_envs/python-3.9_r-4.1/bin/x86_64-conda-linux-gnu-gcc",
"GCC_AR": "/hpc/scratch/hdd1/mp674001/software/conda_envs/python-3.9_r-4.1/bin/x86_64-conda-linux-gnu-gcc-ar",
"GCC_NM": "/hpc/scratch/hdd1/mp674001/software/conda_envs/python-3.9_r-4.1/bin/x86_64-conda-linux-gnu-gcc-nm",
"GCC_RANLIB": "/hpc/scratch/hdd1/mp674001/software/conda_envs/python-3.9_r-4.1/bin/x86_64-conda-linux-gnu-gcc-ranlib",
"GFORTRAN": "/hpc/scratch/hdd1/mp674001/software/conda_envs/python-3.9_r-4.1/bin/x86_64-conda-linux-gnu-gfortran",
"GIT_PAGER": "cat",
"GPROF": "/hpc/scratch/hdd1/mp674001/software/conda_envs/python-3.9_r-4.1/bin/x86_64-conda-linux-gnu-gprof",
"GSETTINGS_SCHEMA_DIR": "/hpc/scratch/hdd1/mp674001/software/conda_envs/python-3.9_r-4.1/share/glib-2.0/schemas",
"GSETTINGS_SCHEMA_DIR_CONDA_BACKUP": "",
"GSK_APPNAME": "ANACONDA3",
"GSK_APPS_ROOT": "/hpc/apps/current",
"GSK_ARCH": "x86_64",
"GSK_CLUSTER": "USHPC",
"GSK_DISTRO": "RHEL-7",
"GSK_MODULES_ROOT": "/hpc/modules/current",
"GSK_SITE": "UPT",
"GXX": "/hpc/scratch/hdd1/mp674001/software/conda_envs/python-3.9_r-4.1/bin/x86_64-conda-linux-gnu-g++",
"HISTCONTROL": "ignoredups",
"HISTSIZE": "1000",
"HOME": "/home/mp674001",
"HOST": "x86_64-conda-linux-gnu",
"HOSTNAME": "cpu-314",
"JPY_PARENT_PID": "84422",
"KMP_DUPLICATE_LIB_OK": "True",
"KMP_INIT_AT_FORK": "FALSE",
"KRB5CCNAME": "/home/mp674001/.krb5cc",
"LANG": "en_US.UTF-8",
"LD": "/hpc/scratch/hdd1/mp674001/software/conda_envs/python-3.9_r-4.1/bin/x86_64-conda-linux-gnu-ld",
"LDFLAGS": "-Wl,-O2 -Wl,--sort-common -Wl,--as-needed -Wl,-z,relro -Wl,-z,now -Wl,--disable-new-dtags -Wl,--gc-sections -Wl,-rpath,/hpc/scratch/hdd1/mp674001/software/conda_envs/python-3.9_r-4.1/lib -Wl,-rpath-link,/hpc/scratch/hdd1/mp674001/software/conda_envs/python-3.9_r-4.1/lib -L/hpc/scratch/hdd1/mp674001/software/conda_envs/python-3.9_r-4.1/lib",
"LD_GOLD": "/hpc/scratch/hdd1/mp674001/software/conda_envs/python-3.9_r-4.1/bin/x86_64-conda-linux-gnu-ld.gold",
"LD_LIBRARY_PATH": "/cm/local/apps/gcc/8.2.0/lib:/cm/local/apps/gcc/8.2.0/lib64:/cm/shared/slurm/current/lib/slurm:/cm/shared/slurm/current/lib:/cm/shared/slurm/current/lib/slurm:/cm/shared/slurm/current/lib",
"LESSOPEN": "||/usr/bin/lesspipe.sh %s",
"LIBRARY_PATH": "/cm/shared/slurm/current/lib/slurm:/cm/shared/slurm/current/lib:/cm/shared/slurm/current/lib/slurm:/cm/shared/slurm/current/lib",
"LOADEDMODULES": "anaconda3/5.0.1b:gcc/8.2.0",
"LOCAL_SCRATCH": "/local_scratch/mp674001/2546311",
"LOGNAME": "mp674001",
"MAIL": "/var/spool/mail/mp674001",
"MANPATH": "/cm/shared/slurm/current/share/man:/cm/shared/slurm/current/share/man:/usr/local/man:/usr/local/share/man:/usr/share/man/overrides:/usr/share/man:/cm/local/apps/environment-modules/current/share/man:/cm/local/apps/environment-modules/current/share/man",
"MODULEPATH": "/cm/local/modulefiles:/cm/shared/modulefiles:/hpc/modules/current/applications:/hpc/modules/current/compilers:/hpc/modules/current/libs:/hpc/modules/current/sets",
"MODULESHOME": "/usr/share/Modules",
"MPLBACKEND": "module://matplotlib_inline.backend_inline",
"MYDATA": "/hpc/mydata/upt/mp674001",
"NM": "/hpc/scratch/hdd1/mp674001/software/conda_envs/python-3.9_r-4.1/bin/x86_64-conda-linux-gnu-nm",
"NOTEBOOK_ROOT": "/home/mp674001",
"NO_AT_BRIDGE": "1",
"OBJCOPY": "/hpc/scratch/hdd1/mp674001/software/conda_envs/python-3.9_r-4.1/bin/x86_64-conda-linux-gnu-objcopy",
"OBJDUMP": "/hpc/scratch/hdd1/mp674001/software/conda_envs/python-3.9_r-4.1/bin/x86_64-conda-linux-gnu-objdump",
"OLDPWD": "/home/mp674001/ondemand/data/sys/dashboard/batch_connect/sys/jupyter-lab/output/8932db07-3953-4232-bf13-f768b195c70b",
"PAGER": "cat",
"PATH": "/hpc/scratch/hdd1/mp674001/software/conda_envs/python-3.9_r-4.1/bin:/hpc/apps/2018/anaconda3/v5.0.1b.app/condabin:/cm/local/apps/gcc/8.2.0/bin:/cm/shared/slurm/current/bin:/hpc/apps/current/anaconda3/v5.0.1b.app/bin:/cm/shared/slurm/current/bin:/usr/lib64/qt-3.3/bin:/usr/local/bin:/bin:/usr/bin:/usr/local/sbin:/usr/sbin:/home/mp674001/.dotnet/tools:/cm/shared/slurm/utils:/home/mp674001/.local/bin:/home/mp674001/bin:/home/mp674001/.dotnet/tools",
"PWD": "/home/mp674001",
"PYDEVD_USE_FRAME_EVAL": "NO",
"QTDIR": "/usr/lib64/qt-3.3",
"QTINC": "/usr/lib64/qt-3.3/include",
"QTLIB": "/usr/lib64/qt-3.3/lib",
"QT_GRAPHICSSYSTEM_CHECKED": "1",
"RANLIB": "/hpc/scratch/hdd1/mp674001/software/conda_envs/python-3.9_r-4.1/bin/x86_64-conda-linux-gnu-ranlib",
"READELF": "/hpc/scratch/hdd1/mp674001/software/conda_envs/python-3.9_r-4.1/bin/x86_64-conda-linux-gnu-readelf",
"REQUESTS_CA_BUNDLE": "/etc/pki/tls/certs/ca-bundle.trust.crt",
"RSTUDIO_WHICH_R": "/hpc/scratch/hdd1/mp674001/software/conda_envs/python-3.9_r-4.1/bin/R",
"SHELL": "/bin/bash",
"SHLVL": "4",
"SITE_LOCAL": "UPT",
"SIZE": "/hpc/scratch/hdd1/mp674001/software/conda_envs/python-3.9_r-4.1/bin/x86_64-conda-linux-gnu-size",
"SLURMD_NODENAME": "cpu-314",
"SLURM_CLUSTER_NAME": "ushpc",
"SLURM_CONF": "//cm/shared/slurm/etc/slurm.conf",
"SLURM_CPUS_ON_NODE": "4",
"SLURM_CPUS_PER_TASK": "4",
"SLURM_EXPORT_ENV": "NONE",
"SLURM_GET_USER_ENV": "1",
"SLURM_GTIDS": "0",
"SLURM_JOBID": "2546311",
"SLURM_JOB_ACCOUNT": "default",
"SLURM_JOB_CPUS_PER_NODE": "4",
"SLURM_JOB_END_TIME": "1707033808",
"SLURM_JOB_GID": "1028",
"SLURM_JOB_ID": "2546311",
"SLURM_JOB_NAME": "sys/dashboard/sys/jupyter-lab",
"SLURM_JOB_NODELIST": "cpu-314",
"SLURM_JOB_NUM_NODES": "1",
"SLURM_JOB_PARTITION": "cpu",
"SLURM_JOB_QOS": "normal",
"SLURM_JOB_START_TIME": "1706515408",
"SLURM_JOB_UID": "19840",
"SLURM_JOB_USER": "mp674001",
"SLURM_LOCALID": "0",
"SLURM_MEM_PER_CPU": "61440",
"SLURM_NNODES": "1",
"SLURM_NODEID": "0",
"SLURM_NODELIST": "cpu-314",
"SLURM_NODE_ALIASES": "(null)",
"SLURM_NPROCS": "1",
"SLURM_NTASKS": "1",
"SLURM_PRIO_PROCESS": "0",
"SLURM_PROCID": "0",
"SLURM_SCRIPT_CONTEXT": "prolog_task",
"SLURM_SUBMIT_DIR": "/var/www/ood/apps/sys/dashboard",
"SLURM_SUBMIT_HOST": "ushpc-ondemand.gsk.com",
"SLURM_TASKS_PER_NODE": "1",
"SLURM_TASK_PID": "84374",
"SLURM_TOPOLOGY_ADDR": "cpu-314",
"SLURM_TOPOLOGY_ADDR_PATTERN": "node",
"SLURM_WORKING_CLUSTER": "ushpc:ushpc:6917:9984:109",
"SSH_ASKPASS": "/usr/libexec/openssh/gnome-ssh-askpass",
"STRINGS": "/hpc/scratch/hdd1/mp674001/software/conda_envs/python-3.9_r-4.1/bin/x86_64-conda-linux-gnu-strings",
"STRIP": "/hpc/scratch/hdd1/mp674001/software/conda_envs/python-3.9_r-4.1/bin/x86_64-conda-linux-gnu-strip",
"TERM": "xterm-color",
"TMPDIR": "/tmp",
"USER": "mp674001",
"XDG_DATA_DIRS": "/home/mp674001/.local/share/flatpak/exports/share:/var/lib/flatpak/exports/share:/usr/local/share:/usr/share",
"_": "/bin/env\n",
"_CE_CONDA": "",
"_CE_M": "",
"_CONDA_PYTHON_SYSCONFIGDATA_NAME": "_sysconfigdata_x86_64_conda_cos6_linux_gnu",
"_LMFILES_": "/hpc/modules/current/applications/anaconda3/5.0.1b:/cm/local/modulefiles/gcc/8.2.0",
"build_alias": "x86_64-conda-linux-gnu",
"gsk_cluster": "ushpc",
"gsk_site": "upt",
"host": "cpu-314.cm.storage",
"host_alias": "x86_64-conda-linux-gnu",
"port": "52409"
}
def generate_anndata_object(directory) -> sc.AnnData:
import anndata
files = os.listdir(directory)
files = list(map(lambda x: os.path.join(directory, x), files))
adata_objs = list(map(sc.read_h5ad, files))
adata_obj = anndata.concat(adata_objs)
adata_obj.var.loc[:, 'gene_symbol'] = adata_objs[0].var.loc[:, 'gene_name']
adata_obj.obs = adata_obj.obs.rename({'mread_count': 'read_count'}, axis=1)
return adata_obj
adata_obj = generate_anndata_object(
configs['imported']['processing']['anndata_objects_dir']
)
adata_obj
AnnData object with n_obs × n_vars = 277082 × 36601
obs: 'sample_name', 'gene_count', 'read_count'
var: 'gene_symbol'
adata_obj.X
<277082x36601 sparse matrix of type '<class 'numpy.float32'>' with 42258620 stored elements in Compressed Sparse Row format>
def add_qc_metrics(adata_obj: sc.AnnData) -> sc.AnnData:
adata_obj.var['mitochondrial'] = adata_obj.var.gene_symbol.str.startswith('MT-')
sc.pp.calculate_qc_metrics(
adata_obj, qc_vars=['mitochondrial'],
percent_top=None, log1p=False, inplace=True
)
adata_obj.obs = adata_obj.obs.rename({'total_counts': 'UMI_count'}, axis=1)
return adata_obj
adata_obj = add_qc_metrics(adata_obj)
adata_obj.var
| gene_symbol | mitochondrial | n_cells_by_counts | mean_counts | pct_dropout_by_counts | total_counts | |
|---|---|---|---|---|---|---|
| gene_id | ||||||
| ENSG00000000003 | TSPAN6 | False | 8 | 0.000029 | 99.997113 | 8.0 |
| ENSG00000000005 | TNMD | False | 0 | 0.000000 | 100.000000 | 0.0 |
| ENSG00000000419 | DPM1 | False | 2894 | 0.013310 | 98.955544 | 3688.0 |
| ENSG00000000457 | SCYL3 | False | 2078 | 0.008932 | 99.250042 | 2475.0 |
| ENSG00000000460 | C1orf112 | False | 3705 | 0.019088 | 98.662851 | 5289.0 |
| ... | ... | ... | ... | ... | ... | ... |
| ENSG00000288380 | CRIPAK | False | 95 | 0.000346 | 99.965714 | 96.0 |
| ENSG00000288398 | AL109627.1 | False | 71 | 0.000267 | 99.974376 | 74.0 |
| ENSG00000288436 | AC024558.2 | False | 63 | 0.000249 | 99.977263 | 69.0 |
| ENSG00000288459 | AL512357.2 | False | 0 | 0.000000 | 100.000000 | 0.0 |
| ENSG00000288460 | AL138899.3 | False | 0 | 0.000000 | 100.000000 | 0.0 |
36601 rows × 6 columns
adata_obj.obs
| sample_name | gene_count | read_count | n_genes_by_counts | UMI_count | total_counts_mitochondrial | pct_counts_mitochondrial | |
|---|---|---|---|---|---|---|---|
| cell_barcode | |||||||
| 02_01_01__CR138AE1P1 | CD4_Tm49_v1 | 13 | 28 | 13 | 16.0 | 0.0 | 0.000000 |
| 02_01_02__CR138AE1P1 | CD4_Tm49_v1 | 12 | 20 | 12 | 14.0 | 0.0 | 0.000000 |
| 02_01_03__CR138AE1P1 | CD4_Tm49_v1 | 13 | 150 | 13 | 19.0 | 0.0 | 0.000000 |
| 02_01_04__CR138AE1P1 | CD4_Tm49_v1 | 16 | 47 | 16 | 23.0 | 0.0 | 0.000000 |
| 02_01_05__CR138AE1P1 | CD4_Tm49_v1 | 455 | 53690 | 455 | 739.0 | 0.0 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 11_96_92__CR5AE1P1 | T-cells_repl-2_v2 | 43 | 46 | 43 | 43.0 | 0.0 | 0.000000 |
| 11_96_93__CR5AE1P1 | T-cells_repl-2_v2 | 35 | 39 | 35 | 35.0 | 0.0 | 0.000000 |
| 11_96_94__CR5AE1P1 | T-cells_repl-2_v2 | 41 | 47 | 41 | 42.0 | 1.0 | 2.380952 |
| 11_96_95__CR5AE1P1 | T-cells_repl-2_v2 | 47 | 53 | 47 | 48.0 | 1.0 | 2.083333 |
| 11_96_96__CR5AE1P1 | T-cells_repl-2_v2 | 71 | 94 | 71 | 76.0 | 1.0 | 1.315789 |
277082 rows × 7 columns
def add_obs_metadata(obs, irms_subm_metadata) -> None:
cell_barcode_metadata = (
obs
.index.to_series()
.str.extract(r'(?P<parse_barcode>.*)__(?P<irms_id>.*)')
)
irms_metadata = (
cell_barcode_metadata
.loc[:, 'irms_id']
.str.extract(r'(?P<irms_subm>[A-Z]+\d+[A-Z]+)(?P<sublib>\d+)P.*')
)
obs = pd.concat([cell_barcode_metadata, irms_metadata, obs], axis=1)
obs.loc[:, 'library_type'] = obs.loc[:, 'irms_subm'].map(irms_subm_metadata)
obs = Utils.str_to_category(obs)
return obs
adata_obj.obs = add_obs_metadata(
adata_obj.obs,
configs['this']['irms_subm_metadata']
)
adata_obj.obs
| parse_barcode | irms_id | irms_subm | sublib | library_type | sample_name | gene_count | read_count | n_genes_by_counts | UMI_count | total_counts_mitochondrial | pct_counts_mitochondrial | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| cell_barcode | ||||||||||||
| 02_01_01__CR138AE1P1 | 02_01_01 | CR138AE1P1 | CR138AE | 1 | TAP-seq | CD4_Tm49_v1 | 13 | 28 | 13 | 16.0 | 0.0 | 0.000000 |
| 02_01_02__CR138AE1P1 | 02_01_02 | CR138AE1P1 | CR138AE | 1 | TAP-seq | CD4_Tm49_v1 | 12 | 20 | 12 | 14.0 | 0.0 | 0.000000 |
| 02_01_03__CR138AE1P1 | 02_01_03 | CR138AE1P1 | CR138AE | 1 | TAP-seq | CD4_Tm49_v1 | 13 | 150 | 13 | 19.0 | 0.0 | 0.000000 |
| 02_01_04__CR138AE1P1 | 02_01_04 | CR138AE1P1 | CR138AE | 1 | TAP-seq | CD4_Tm49_v1 | 16 | 47 | 16 | 23.0 | 0.0 | 0.000000 |
| 02_01_05__CR138AE1P1 | 02_01_05 | CR138AE1P1 | CR138AE | 1 | TAP-seq | CD4_Tm49_v1 | 455 | 53690 | 455 | 739.0 | 0.0 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 11_96_92__CR5AE1P1 | 11_96_92 | CR5AE1P1 | CR5AE | 1 | Gex | T-cells_repl-2_v2 | 43 | 46 | 43 | 43.0 | 0.0 | 0.000000 |
| 11_96_93__CR5AE1P1 | 11_96_93 | CR5AE1P1 | CR5AE | 1 | Gex | T-cells_repl-2_v2 | 35 | 39 | 35 | 35.0 | 0.0 | 0.000000 |
| 11_96_94__CR5AE1P1 | 11_96_94 | CR5AE1P1 | CR5AE | 1 | Gex | T-cells_repl-2_v2 | 41 | 47 | 41 | 42.0 | 1.0 | 2.380952 |
| 11_96_95__CR5AE1P1 | 11_96_95 | CR5AE1P1 | CR5AE | 1 | Gex | T-cells_repl-2_v2 | 47 | 53 | 47 | 48.0 | 1.0 | 2.083333 |
| 11_96_96__CR5AE1P1 | 11_96_96 | CR5AE1P1 | CR5AE | 1 | Gex | T-cells_repl-2_v2 | 71 | 94 | 71 | 76.0 | 1.0 | 1.315789 |
277082 rows × 12 columns
def add_var_metadata(var) -> pd.DataFrame:
amplified_map = {True: 'amplified', False: 'non-amplified'}
gene_symbols = pd.read_csv(configs['this']['primers_file']).gene_symbol
var = (
var.copy()
.assign(amplified = lambda x: x.gene_symbol.isin(gene_symbols).map(amplified_map))
.pipe(Utils.str_to_category)
)
return var
adata_obj.var = add_var_metadata(adata_obj.var)
adata_obj.var
| amplified | gene_symbol | mitochondrial | n_cells_by_counts | mean_counts | pct_dropout_by_counts | total_counts | |
|---|---|---|---|---|---|---|---|
| gene_id | |||||||
| ENSG00000000003 | non-amplified | TSPAN6 | False | 8 | 0.000029 | 99.997113 | 8.0 |
| ENSG00000000005 | non-amplified | TNMD | False | 0 | 0.000000 | 100.000000 | 0.0 |
| ENSG00000000419 | non-amplified | DPM1 | False | 2894 | 0.013310 | 98.955544 | 3688.0 |
| ENSG00000000457 | non-amplified | SCYL3 | False | 2078 | 0.008932 | 99.250042 | 2475.0 |
| ENSG00000000460 | non-amplified | C1orf112 | False | 3705 | 0.019088 | 98.662851 | 5289.0 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| ENSG00000288380 | non-amplified | CRIPAK | False | 95 | 0.000346 | 99.965714 | 96.0 |
| ENSG00000288398 | non-amplified | AL109627.1 | False | 71 | 0.000267 | 99.974376 | 74.0 |
| ENSG00000288436 | non-amplified | AC024558.2 | False | 63 | 0.000249 | 99.977263 | 69.0 |
| ENSG00000288459 | non-amplified | AL512357.2 | False | 0 | 0.000000 | 100.000000 | 0.0 |
| ENSG00000288460 | non-amplified | AL138899.3 | False | 0 | 0.000000 | 100.000000 | 0.0 |
36601 rows × 7 columns
def import_read_counts(directory, name) -> pd.Series:
files = os.listdir(directory)
counts = {}
for file in files:
file = f'{directory}/{file}'
irms_id = os.path.basename(file).split('.')[0]
with open(file, 'r') as f:
counts[irms_id] = int(f.read().strip())
counts = pd.Series(counts).rename(name)
return counts
def extract_total_counts(adata_obj) -> None:
bam_counts: pd.Series = import_read_counts(
configs['imported']['processing']['count_bam_files_lines_outputs_dir'],
'bam_file_alignments'
)
fastq_counts: pd.Series = import_read_counts(
configs['imported']['processing']['count_fastq_files_lines_outputs_dir'],
'fastq_file_reads'
)
library_spec_cols = ['irms_id']
umi_counts: pd.DataFrame = (
adata_obj.obs
.groupby(library_spec_cols)
[['UMI_count']]
.sum()
)
read_counts: pd.DataFrame = (
adata_obj.obs
.groupby(library_spec_cols)
[['read_count']]
.sum()
)
total_counts = (
umi_counts
.merge(read_counts, left_index=True, right_index=True)
.merge(bam_counts, left_index=True, right_index=True)
.merge(fastq_counts, left_index=True, right_index=True)
.rename_axis(['irms_id'], axis=0)
)
return total_counts
total_counts: pd.DataFrame = extract_total_counts(adata_obj)
total_counts
| UMI_count | read_count | bam_file_alignments | fastq_file_reads | |
|---|---|---|---|---|
| irms_id | ||||
| CR138AE1P1 | 4208067.0 | 141927615 | 167320989 | 218569999 |
| CR138AE2P2 | 4064390.0 | 161176252 | 188891430 | 246578307 |
| CR5AE1P1 | 20461556.0 | 24327785 | 48238817 | 118552710 |
| CR5AE2P2 | 19957012.0 | 24269663 | 46262511 | 113149536 |
def plot_read_counts(total_counts: pd.DataFrame, figsize=(3, 4)) -> None:
import seaborn as sns
obj = (
total_counts
.melt(ignore_index=False, var_name='stat', value_name='count')
.reset_index()
)
fig, ax = plt.subplots(1, 1, figsize=figsize)
sns.barplot(data=obj, x='irms_id', y='count', hue='stat',ax=ax)
ax.tick_params(axis = 'x', labelrotation = 90, bottom = False)
ax.legend(bbox_to_anchor = (1, 1))
ax.spines['bottom'].set_visible(False)
file_name = f"{configs['this']['plots_dir']}/barplot__read_counts.jpg"
fig.savefig(file_name)
plot_read_counts(total_counts)
total_counts.sum()/1e6
UMI_count 72.538848 read_count 392.824559 bam_file_alignments 450.713747 fastq_file_reads 696.850552 dtype: float64
def extract_obs_read_tab(adata_obj) -> pd.DataFrame:
obs_read_tab = (
adata_obj.obs
.pivot(
index=['parse_barcode', 'sublib'],
columns='library_type',
values='UMI_count'
)
.dropna()
)
return obs_read_tab
obs_read_tab = extract_obs_read_tab(adata_obj)
obs_read_tab
| library_type | Gex | TAP-seq | |
|---|---|---|---|
| parse_barcode | sublib | ||
| 01_01_05 | 1 | 288.0 | 22.0 |
| 2 | 113.0 | 10.0 | |
| 01_01_07 | 1 | 218.0 | 18.0 |
| 2 | 212.0 | 34.0 | |
| 01_01_08 | 1 | 149.0 | 13.0 |
| ... | ... | ... | ... |
| 12_96_93 | 1 | 152.0 | 22.0 |
| 2 | 66.0 | 14.0 | |
| 12_96_94 | 1 | 290.0 | 38.0 |
| 12_96_95 | 1 | 188.0 | 24.0 |
| 2 | 116.0 | 15.0 |
81518 rows × 2 columns
def generate_cluster_labels(model) -> dict:
means = model.means_.sum(axis=1)
labels_map = {np.argmin(means): 'non-cell', np.argmax(means): 'cell'}
return labels_map
def identify_cells(obs_read_tab: pd.DataFrame) -> np.ndarray:
import sklearn.mixture
X = np.log1p(obs_read_tab)
model = sklearn.mixture.GaussianMixture(
n_components=2, random_state=0, covariance_type='tied'
)
model.fit(X)
clusters = pd.Series(model.predict(X), index=X.index)
labels_map = generate_cluster_labels(model)
labels = clusters.map(labels_map)
return labels
obs_read_tab.loc[:, 'cluster'] = identify_cells(obs_read_tab)
obs_read_tab
| library_type | Gex | TAP-seq | cluster | |
|---|---|---|---|---|
| parse_barcode | sublib | |||
| 01_01_05 | 1 | 288.0 | 22.0 | non-cell |
| 2 | 113.0 | 10.0 | non-cell | |
| 01_01_07 | 1 | 218.0 | 18.0 | non-cell |
| 2 | 212.0 | 34.0 | non-cell | |
| 01_01_08 | 1 | 149.0 | 13.0 | non-cell |
| ... | ... | ... | ... | ... |
| 12_96_93 | 1 | 152.0 | 22.0 | non-cell |
| 2 | 66.0 | 14.0 | non-cell | |
| 12_96_94 | 1 | 290.0 | 38.0 | non-cell |
| 12_96_95 | 1 | 188.0 | 24.0 | non-cell |
| 2 | 116.0 | 15.0 | non-cell |
81518 rows × 3 columns
obs_read_tab.cluster.value_counts()
non-cell 74150 cell 7368 Name: cluster, dtype: int64
def plot_obs_correlation_gex_tap(
tab: pd.DataFrame,
x_label, y_label,
figsize=(6,6)
) -> None:
x = tab.loc[:, x_label].values
y = tab.loc[:, y_label].values
fig = plt.figure(figsize=figsize)
gs = fig.add_gridspec(
2,2,
width_ratios = (4,1),
height_ratios = (1,4),
wspace=0, hspace=0
)
ax_sca = fig.add_subplot(gs[1,0])
ax_hist_x = fig.add_subplot(gs[0,0], sharex=ax_sca)
ax_hist_y = fig.add_subplot(gs[1,1], sharey=ax_sca)
ax_sca.scatter(x, y, alpha=0.5, c='grey',edgecolor='none', s=0.5, label='non-cell')
ax_sca.scatter(
obs_read_tab.query('cluster == "cell"').loc[:, x_label],
obs_read_tab.query('cluster == "cell"').loc[:, y_label],
alpha=0.5, c='tab:blue', edgecolor='none', s=0.5,
label='cell'
)
ax_sca.set_xlim(5, None)
ax_sca.set_ylim(5, None)
ax_sca.set_yscale('log')
ax_sca.set_xscale('log')
ax_sca.set_xlabel(x_label)
ax_sca.set_ylabel(y_label)
ax_sca.legend(markerscale=20)
x_bins = np.logspace(0, np.log10(np.max(x)), num=100, base=10)
ax_hist_x.hist(x, bins=x_bins, color='grey')
ax_hist_x.axis('off')
y_bins = np.logspace(0, np.log10(np.max(y)), num=100, base=10)
ax_hist_y.hist(y, bins=y_bins, color='grey', orientation='horizontal')
ax_hist_y.axis('off')
fig.suptitle('Total UMI / cell')
file_name = f"{configs['this']['plots_dir']}/cell_gex_vs_tap_correlation.jpg"
fig.savefig(file_name)
plot_obs_correlation_gex_tap(
obs_read_tab,
x_label='Gex',
y_label='TAP-seq'
)
def filter_cell_barcodes(
adata_obj, obs_read_tab
) -> pd.Series:
obs_read_tab = (
obs_read_tab
.query('cluster == "cell"')
.drop('cluster', axis=1)
.melt(ignore_index=False)
.drop('value', axis=1)
.assign(is_cell=1)
.reset_index()
)
id_cols = ['parse_barcode', 'sublib', 'library_type']
cell_barcodes = (
adata_obj.obs
.reset_index()
.merge(obs_read_tab, on=id_cols) # inner join (only the filtered barcodes are kept)
.cell_barcode
.tolist()
)
return cell_barcodes
adata_obj = adata_obj[
filter_cell_barcodes(adata_obj, obs_read_tab),
:
]
adata_obj
View of AnnData object with n_obs × n_vars = 14736 × 36601
obs: 'parse_barcode', 'irms_id', 'irms_subm', 'sublib', 'library_type', 'sample_name', 'gene_count', 'read_count', 'n_genes_by_counts', 'UMI_count', 'total_counts_mitochondrial', 'pct_counts_mitochondrial'
var: 'amplified', 'gene_symbol', 'mitochondrial', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts'
adata_obj.write_h5ad(f"{configs['this']['python_binary_objects_dir']}/adata_obj_cells_filtered.h5ad")
adata_obj = sc.read_h5ad(f"{configs['this']['python_binary_objects_dir']}/adata_obj_cells_filtered.h5ad")
pd.crosstab(adata_obj.obs.library_type, adata_obj.obs.sublib)
| sublib | 1 | 2 |
|---|---|---|
| library_type | ||
| Gex | 3888 | 3480 |
| TAP-seq | 3888 | 3480 |
def assemble_UMI_count_data_frame(adata_obj, library_spec_cols: list) -> pd.DataFrame:
index = pd.MultiIndex.from_frame(
adata_obj.obs.loc[:, library_spec_cols]
)
columns = (
adata_obj.var
.loc[:, ['gene_symbol', 'amplified']]
.reset_index()
)
columns = pd.MultiIndex.from_frame(columns)
X = pd.DataFrame(
adata_obj.X.toarray(),
columns=columns,
index=index
)
return X
def extract_var_read_tab(adata_obj) -> pd.DataFrame:
library_spec_cols = ['library_type', 'sublib']
X: pd.DataFrame = assemble_UMI_count_data_frame(adata_obj, library_spec_cols)
var_read_tab = (
X
.groupby(library_spec_cols)
.agg('sum')
.T
.stack(level='sublib')
.loc[lambda x: x.sum(axis=1).gt(0), :]
)
return var_read_tab
var_read_tab = extract_var_read_tab(adata_obj)
var_read_tab
| library_type | Gex | TAP-seq | |||
|---|---|---|---|---|---|
| gene_id | gene_symbol | amplified | sublib | ||
| ENSG00000000003 | TSPAN6 | non-amplified | 1 | 0.0 | 2.0 |
| 2 | 3.0 | 1.0 | |||
| ENSG00000000419 | DPM1 | non-amplified | 1 | 1160.0 | 78.0 |
| 2 | 1144.0 | 65.0 | |||
| ENSG00000000457 | SCYL3 | non-amplified | 1 | 733.0 | 51.0 |
| ... | ... | ... | ... | ... | ... |
| ENSG00000288380 | CRIPAK | non-amplified | 2 | 32.0 | 3.0 |
| ENSG00000288398 | AL109627.1 | non-amplified | 1 | 25.0 | 4.0 |
| 2 | 15.0 | 5.0 | |||
| ENSG00000288436 | AC024558.2 | non-amplified | 1 | 16.0 | 2.0 |
| 2 | 22.0 | 3.0 |
54039 rows × 2 columns
def plot_var_correlation_gex_tap(tab, x_label, y_label, figsize=(6,6)) -> None:
x = tab.loc[:, x_label].values
y = tab.loc[:, y_label].values
fig = plt.figure(figsize=figsize)
gs = fig.add_gridspec(
2,2,
width_ratios = (4,1),
height_ratios = (1,4),
wspace=0, hspace=0
)
ax_sca = fig.add_subplot(gs[1,0])
ax_hist_x = fig.add_subplot(gs[0,0], sharex=ax_sca)
ax_hist_y = fig.add_subplot(gs[1,1], sharey=ax_sca)
ax_sca.scatter(x, y, alpha=0.5, c='grey',edgecolor='none', s=0.5)
ax_sca.scatter(
tab.query('amplified == "amplified"').loc[:, x_label],
tab.query('amplified == "amplified"').loc[:, y_label],
c='tab:blue', edgecolor='none', s=20, label='amplified'
)
max_value = np.max(tab.values.flatten())
ax_sca.plot(
[10, max_value], [10, max_value],
color='grey', linestyle='--'
)
ax_sca.set_yscale('log')
ax_sca.set_xscale('log')
ax_sca.set_xlabel(x_label)
ax_sca.set_ylabel(y_label)
ax_sca.legend()
x_bins = np.logspace(0, np.log10(np.max(x)), num=100, base=10)
ax_hist_x.hist(x, bins=x_bins, color='grey')
ax_hist_x.axis('off')
y_bins = np.logspace(0, np.log10(np.max(y)), num=100, base=10)
ax_hist_y.hist(y, bins=y_bins, color='grey', orientation='horizontal')
ax_hist_y.axis('off')
fig.suptitle('total UMI / (gene*library)')
file_name = f"{configs['this']['plots_dir']}/genes_gex_vs_tap_correlation.jpg"
fig.savefig(file_name)
plot_var_correlation_gex_tap(
var_read_tab, x_label='Gex', y_label='TAP-seq'
)
def get_gene_depth_fractions(tab) -> dict:
depth_fractions = (
tab
.melt(ignore_index=False, value_name='read_counts')
.set_index('library_type', append=True)
.groupby(['library_type', 'sublib'])
.transform(lambda x: x/x.sum())
.unstack(level=['library_type', 'sublib'])
.droplevel(level=0, axis=1)
.to_dict('list')
)
return depth_fractions
def plot_gene_depth_cumulative(tab: pd.DataFrame) -> None:
depth_fractions: dict = get_gene_depth_fractions(tab)
colors = {'Gex': 'grey', 'TAP-seq': 'tab:blue'}
fig, ax = plt.subplots(1,1, figsize=(5,5))
for (library_type, sublib), depth_fraction in depth_fractions.items():
x = np.arange(len(depth_fraction))
depth_fraction = np.sort(depth_fraction)
depth_cumul = np.cumsum(depth_fraction)
ax.plot(
x, depth_cumul,
c=colors[library_type],
label=f'{library_type}, {sublib}'
)
ax.set_yscale('log')
ax.set_xlabel('gene rank')
ax.set_ylabel('cumulative UMI fraction')
ax.legend()
file_name = f"{configs['this']['plots_dir']}/cumulative_UMI_fraction_by_library.jpg"
fig.savefig(file_name)
plot_gene_depth_cumulative(var_read_tab)
def get_umi_quantile(umi_by_library: dict, quantile: float) -> float:
umi_collapsed = itertools.chain.from_iterable(umi_by_library.values())
umi_collapsed = list(umi_collapsed)
umi_collapsed = np.array(umi_collapsed)
umi_quantile = np.quantile(umi_collapsed, quantile)
return umi_quantile
def gather_UMI_by_gene_library(adata_obj) -> dict:
adata_obj_subset = adata_obj[:, adata_obj.var.amplified.eq('amplified')]
index = (
adata_obj_subset.obs
.loc[:, ['library_type', 'sublib']]
.apply(lambda row: '_'.join(row.values.astype(str)), axis=1)
.rename('library_id')
)
columns = adata_obj_subset.var.loc[:, 'gene_symbol']
X = adata_obj_subset.X.toarray()
X = pd.DataFrame(X, index=index, columns=columns)
umi: dict = X.groupby('library_id').agg(list).to_dict()
return umi
def boxplot_UMI_by_library(
gene_symbol: str,
umi_by_library: dict,
figsize=(2,3)
) -> None:
fig, ax = plt.subplots(1,1, figsize=figsize)
plot = ax.violinplot(umi_by_library.values(), widths=0.8,showextrema=False)
for pc in plot['bodies']:
pc.set_facecolor('grey')
max_ylim = get_umi_quantile(umi_by_library, 0.99)
labels = umi_by_library.keys()
ax.set_xticks(np.arange(1, len(labels) + 1), labels=labels)
ax.tick_params(which="major", bottom=False, left=False)
ax.tick_params(axis='x', labelrotation=90)
ax.set_title(gene_symbol)
ax.set_ylabel('UMI')
ax.set_ylim(0, max_ylim)
ax.spines['bottom'].set_visible(False)
file_name = f"{configs['this']['plots_dir']}/boxplot_UMI_by_library_id_{gene_symbol}.jpg"
fig.savefig(file_name)
_ = list(map(
boxplot_UMI_by_library,
*zip(*gather_UMI_by_gene_library(adata_obj).items())
))
/tmp/ipykernel_145515/2266887416.py:38: UserWarning: Attempting to set identical bottom == top == 0 results in singular transformations; automatically expanding. ax.set_ylim(0, max_ylim) /tmp/ipykernel_145515/2266887416.py:27: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`). fig, ax = plt.subplots(1,1, figsize=figsize)
def summarize(
stat_cols: list, stat: list,
index_cols: list, columns_cols: list,
adata_obj
) -> pd.DataFrame:
stats = (
adata_obj
.obs
.assign(cell=1) # just as a label for cell count on the plot
.pipe(Utils.category_to_str) # won't create empty combinations
.groupby(index_cols + columns_cols)
[stat_cols]
.agg(stat)
.sort_index()
.astype(np.int64)
.rename_axis(['variable', 'stat'], axis=1)
.unstack(columns_cols)
)
return stats
def summarize_wrapper() -> None:
stat_col_stat_items = [
[['cell'], ['count']],
[['read_count'], ['sum']],
[['read_count'], ['median']],
[['UMI_count'], ['sum']],
[['UMI_count'], ['median']],
[['gene_count'], ['median']],
]
names = map(
lambda level_1: '__'.join(map(lambda level_2: '_'.join(level_2), level_1)),
stat_col_stat_items
)
summarize_partial = functools.partial(
summarize,
adata_obj=adata_obj,
index_cols=['sample_name'],
columns_cols=['library_type', 'sublib']
)
summary = map(summarize_partial, *zip(*stat_col_stat_items))
summary = dict(zip(names, summary))
return summary
stats = summarize_wrapper()
stats
{'cell__count': variable cell
stat count
library_type Gex TAP-seq
sublib 1 2 1 2
sample_name
CD4_Tm49_v1 375 317 375 317
CD4_Tm56_v1 375 297 375 297
CD4_no-gRNA-enrichment_v1 154 122 154 122
NK92__v2 454 457 454 457
PBMC_new-fixation_v1 15 20 15 20
PBMC_new-fixation_v2 353 335 353 335
T-cells_TotalSeqA-control_v1 127 106 127 106
T-cells_TotalSeqA_v1 140 122 140 122
T-cells__glyoxal-fixed 905 863 905 863
T-cells__v1 127 112 127 112
T-cells_repl-1_v2 428 371 428 371
T-cells_repl-2_v2 435 358 435 358,
'read_count__sum': variable read_count
stat sum
library_type Gex TAP-seq
sublib 1 2 1 2
sample_name
CD4_Tm49_v1 1402665 1465681 7627830 9896444
CD4_Tm56_v1 1133136 1121960 6788524 6995670
CD4_no-gRNA-enrichment_v1 489716 525383 2584914 3152990
NK92__v2 5799159 5581361 30308837 32010756
PBMC_new-fixation_v1 28299 30689 107546 165278
PBMC_new-fixation_v2 1406498 1546425 4237461 4987698
T-cells_TotalSeqA-control_v1 441894 321376 2146871 1549416
T-cells_TotalSeqA_v1 369821 331929 1574127 1850437
T-cells__glyoxal-fixed 5728358 6138929 37862298 47312172
T-cells__v1 255551 303127 1378633 1917850
T-cells_repl-1_v2 3085262 3170225 21023566 25350379
T-cells_repl-2_v2 4187426 3732578 26287008 25987162,
'read_count__median': variable read_count
stat median
library_type Gex TAP-seq
sublib 1 2 1 2
sample_name
CD4_Tm49_v1 3069 3713 14606 22088
CD4_Tm56_v1 2427 2985 13399 17410
CD4_no-gRNA-enrichment_v1 2482 3171 13168 18500
NK92__v2 9597 5960 27823 22367
PBMC_new-fixation_v1 1341 1333 3757 3116
PBMC_new-fixation_v2 2573 3125 7103 8917
T-cells_TotalSeqA-control_v1 2452 2239 10928 9905
T-cells_TotalSeqA_v1 1760 2104 6293 9889
T-cells__glyoxal-fixed 3164 3106 17622 15979
T-cells__v1 1860 2446 7316 13226
T-cells_repl-1_v2 4967 5538 32336 37680
T-cells_repl-2_v2 6384 7160 38077 45417,
'UMI_count__sum': variable UMI_count
stat sum
library_type Gex TAP-seq
sublib 1 2 1 2
sample_name
CD4_Tm49_v1 1169796 1191289 240729 240379
CD4_Tm56_v1 939406 905479 211931 195368
CD4_no-gRNA-enrichment_v1 404958 423213 87025 88398
NK92__v2 4893377 4618710 1094368 1019596
PBMC_new-fixation_v1 23545 24883 4338 4520
PBMC_new-fixation_v2 1173200 1257672 205534 212616
T-cells_TotalSeqA-control_v1 364625 258718 78806 52917
T-cells_TotalSeqA_v1 306162 266540 72101 62282
T-cells__glyoxal-fixed 4862843 5108458 957204 1023200
T-cells__v1 213035 246042 43299 48348
T-cells_repl-1_v2 2584972 2591049 536824 538001
T-cells_repl-2_v2 3525638 3064960 675908 578765,
'UMI_count__median': variable UMI_count
stat median
library_type Gex TAP-seq
sublib 1 2 1 2
sample_name
CD4_Tm49_v1 2542 3013 539 616
CD4_Tm56_v1 1993 2422 464 526
CD4_no-gRNA-enrichment_v1 2034 2561 463 534
NK92__v2 8067 4956 1953 1166
PBMC_new-fixation_v1 1130 1088 197 183
PBMC_new-fixation_v2 2150 2562 369 410
T-cells_TotalSeqA-control_v1 2014 1815 422 385
T-cells_TotalSeqA_v1 1489 1671 338 397
T-cells__glyoxal-fixed 2697 2611 508 487
T-cells__v1 1537 1998 302 364
T-cells_repl-1_v2 4156 4560 896 943
T-cells_repl-2_v2 5421 5883 1066 1149,
'gene_count__median': variable gene_count
stat median
library_type Gex TAP-seq
sublib 1 2 1 2
sample_name
CD4_Tm49_v1 1614 1861 398 446
CD4_Tm56_v1 1337 1529 350 388
CD4_no-gRNA-enrichment_v1 1363 1610 349 389
NK92__v2 3669 2817 1244 793
PBMC_new-fixation_v1 869 817 171 160
PBMC_new-fixation_v2 1329 1494 299 326
T-cells_TotalSeqA-control_v1 1345 1213 328 297
T-cells_TotalSeqA_v1 1053 1105 264 292
T-cells__glyoxal-fixed 1748 1700 295 277
T-cells__v1 1097 1384 237 283
T-cells_repl-1_v2 2290 2300 584 593
T-cells_repl-2_v2 2711 2817 679 728}
def plot_heatmap(
obj: pd.DataFrame,
file_name: str = None,
vmin: float = None,
vmax: float = None,
cmap: str = 'Purples',
cmap_label: str = None,
title: str = None,
figsize_scalar = 0.4,
colorbar_shrink: float = 0.5,
colorbar_fraction: float = 0.3,
axis_labels: bool = False
) -> None:
import matplotlib.cm
import matplotlib.colors
if vmax is None:
vmax = np.max(obj.values)
if vmin is None:
vmin = np.min(obj.values)
cmap = matplotlib.cm.get_cmap(cmap)
cmap_norm = matplotlib.colors.Normalize(vmin=vmin, vmax=vmax)
figsize = np.flip(np.array(obj.shape)) * figsize_scalar
fig = plt.figure(figsize = figsize)
ax = fig.add_subplot()
ax.imshow(obj.values, cmap = cmap, norm = cmap_norm)
ax.set_xticks(np.arange(obj.shape[1]))
ax.set_yticks(np.arange(obj.shape[0]))
ax.set_xticks((np.arange(obj.shape[1]) + 0.5)[:-1], minor=True)
ax.set_yticks((np.arange(obj.shape[0]) + 0.5)[:-1], minor=True)
ax.tick_params(which="minor", bottom=False, left=False)
ax.tick_params(which="major", bottom=False, left=False)
ax.grid(which='minor', color='white', linestyle='-', linewidth=2)
def simplify_labels(x: pd.Index) -> list:
if x.nlevels > 1:
x = x.to_frame().astype(np.str_).values.tolist()
labels = list(map(lambda x: ' | '.join(x), x))
else:
labels = x.to_series().tolist()
return labels
ax.set_xticklabels(simplify_labels(obj.columns), rotation = 90)
ax.set_yticklabels(simplify_labels(obj.index))
if axis_labels:
ax.set_ylabel(obj.index.name)
ax.set_xlabel(obj.columns.name)
list(map(lambda edge, spine: spine.set_visible(False), *zip(*ax.spines.items())))
if title is not None:
ax.set_title(title)
fig.colorbar(
matplotlib.cm.ScalarMappable(norm=cmap_norm, cmap=cmap),
ax = ax,
orientation='vertical',
label = cmap_label,
shrink = colorbar_shrink,
fraction = colorbar_fraction
)
if file_name is not None:
fig.savefig(file_name)
def plot_heatmaps(stats):
file_names = map(
lambda x: f"{configs['this']['plots_dir']}/{x}.jpg",
stats.keys()
)
plot_heatmap_partial = functools.partial(plot_heatmap, vmin=0, cmap='Spectral')
list(map(plot_heatmap_partial, stats.values(), file_names))
plot_heatmaps(stats)